library(knitr)
# Default knitr chunk options for the whole document: where figures and cache
# files are written, caching off, code echoed, messages and warnings hidden.
knitr::opts_chunk$set(
  fig.path = "./tempdata/images/",
  cache.path = "cache/",
  cache = FALSE,
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
library(dplyr)
library(leaflet)
library(ggthemes)
library(ggplot2)
library(plotly)
library(leaflet)
library(readr)
library(RColorBrewer)
library(geosphere)
library(geojsonio)
library(sp)
library(htmltools)
library(data.table)

#Automate for entire dataset

# Replace missing values in each column with that column's mean.
#
# Args:
#   data: a data frame (or data-frame-like object supporting `[[`).
#
# Returns:
#   `data` with every NA in a column replaced by the mean of the column's
#   non-missing values. Character and factor columns are returned untouched
#   because a mean is undefined for them; columns without any NA are also
#   left exactly as they are. A column that is entirely NA has no mean, so
#   its values become NaN.
data_replace_mean <- function(data) {
  # seq_len() yields an empty sequence for a zero-column input, whereas
  # 1:ncol(data) would iterate over c(1, 0) and fail.
  for (i in seq_len(ncol(data))) {
    column <- data[[i]]

    # mean() only returns NA (with a warning) for text-like columns.
    if (is.character(column) || is.factor(column)) {
      next
    }

    missing_rows <- is.na(column)
    if (!any(missing_rows)) {
      next
    }

    data[[i]][missing_rows] <- mean(column, na.rm = TRUE)
  }
  data
}

# Columns of the raw listings table that the rest of the analysis works with.
listings_col_to_keep <- c(
  # listing and host identity
  "id", "name", "host_id", "host_name", "host_location",
  # location
  "neighbourhood_cleansed", "neighbourhood_group_cleansed",
  "city", "state", "smart_location", "latitude", "longitude",
  # property description
  "property_type", "room_type", "accommodates",
  "bathrooms", "bedrooms", "beds",
  # pricing
  "price", "weekly_price", "monthly_price",
  "security_deposit", "cleaning_fee",
  # review scores
  "review_scores_rating", "review_scores_accuracy",
  "review_scores_cleanliness", "review_scores_checkin",
  "review_scores_communication", "review_scores_location",
  "review_scores_value", "reviews_per_month",
  # host portfolio size, availability and review volume
  "host_listings_count", "host_total_listings_count",
  "calculated_host_listings_count", "availability_365",
  "number_of_reviews"
)

# Reduce a raw listings table to the columns named in `listings_col_to_keep`
# and fill missing values via `data_replace_mean()`.
#
# Args:
#   df: listings data frame containing every column in `listings_col_to_keep`.
#
# Returns:
#   The column-subsetted data frame after mean imputation.
df_subset_columns <- function(df) {
  kept_columns <- subset(df, select = listings_col_to_keep)
  imputed <- data_replace_mean(kept_columns)
  return(imputed)
}

# Label every listing as "Commercial Listing" or "Household Listing".
#
# A listing is labelled commercial when all three criteria hold:
#   * calculated_host_listings_count is greater than `min_host_listings`,
#   * availability_365 / 365 is below `max_availability_share`,
#   * review_scores_rating is greater than `min_rating`.
# Every other listing is labelled household. If a criterion evaluates to NA
# and the remaining ones are TRUE, ifelse() yields NA for that listing.
#
# Args:
#   df: listings data frame containing every column in `listings_col_to_keep`.
#   min_host_listings: exclusive lower bound on the host's listing count.
#   max_availability_share: exclusive upper bound on the fraction of the year
#     the listing is available.
#   min_rating: exclusive lower bound on the overall review rating.
#   The defaults reproduce the thresholds that were previously hard-coded.
#
# Returns:
#   A two-column data frame: `id` and `is_commercial`.
clean_data <- function(df,
                       min_host_listings = 1,
                       max_availability_share = 0.3,
                       min_rating = 50) {
  df_subset <- subset(df, select = listings_col_to_keep)
  df_subset <- data_replace_mean(df_subset)

  has_multiple_listings <-
    df_subset$calculated_host_listings_count > min_host_listings
  below_availability_cap <-
    df_subset$availability_365 / 365 < max_availability_share
  above_rating_floor <- df_subset$review_scores_rating > min_rating

  # The column is assigned once here; the former `is_commercial = NA`
  # placeholder was overwritten immediately and raised an error on a
  # zero-row data frame.
  df_subset$is_commercial <- ifelse(
    has_multiple_listings & below_availability_cap & above_rating_floor,
    "Commercial Listing",
    "Household Listing"
  )

  subset(df_subset, select = c("id", "is_commercial"))
}

# Load the listings snapshots with data.table::fread().
listings_file_1 <- fread("./all Listing Data/2018_March_listings.csv")
# Console progress captured from an earlier run. It was pasted into the script
# as bare text, which is not valid R, so it is kept here as comments only:
##
## Read 26.7% of 74871 rows
## Read 53.4% of 74871 rows
## Read 48852 rows and 96 (of 96) columns from 0.162 GB file in 00:00:05
# NOTE(review): this reads the same file as listings_file_1, so every listing
# appears twice downstream. A different snapshot was presumably intended;
# confirm the correct path.
listings_file_2 <- fread("./all Listing Data/2018_March_listings.csv")

# fread() returns data.tables; convert to plain data frames because the
# helper functions index columns with base data-frame syntax (`data[, i]`).
listings_file_1 <- as.data.frame(listings_file_1)
listings_file_2 <- as.data.frame(listings_file_2)

# Label the listings of each snapshot and stack the (id, label) pairs.
listing_1_commercial <- clean_data(listings_file_1)
listing_2_commercial <- clean_data(listings_file_2)
combined_commercial <- rbind(listing_1_commercial, listing_2_commercial)

# Split the stacked labels by class.
all_commercial <- combined_commercial %>%
  filter(is_commercial == "Commercial Listing")
all_non_commercial <- combined_commercial %>%
  filter(is_commercial == "Household Listing")

# One row per listing id. An id that was labelled commercial in any snapshot
# counts as commercial, so it is removed from the household ids.
unique_all_commercial <- unique(all_commercial$id)
unique_all_non_commercial <- data.frame(id = unique(all_non_commercial$id)) %>%
  filter(!(id %in% unique_all_commercial))
unique_all_commercial <- data.frame(id = unique_all_commercial)

unique_all_commercial$is_commercial <- "Commercial Listing"
unique_all_non_commercial$is_commercial <- "Household Listing"

# Lookup table: listing id -> final label.
combined_unique_listings <- rbind(
  unique_all_commercial,
  unique_all_non_commercial
)

# Attach the final label to the cleaned listings of each snapshot. The left
# join (all.x = TRUE) keeps listings that have no label.
listings_updated_1 <- df_subset_columns(listings_file_1)
listings_updated_2 <- df_subset_columns(listings_file_2)

total_listing_1 <- merge(
  listings_updated_1, combined_unique_listings,
  by = "id", all.x = TRUE
)
total_listing_2 <- merge(
  listings_updated_2, combined_unique_listings,
  by = "id", all.x = TRUE
)

# All labelled listings, plus one table per class.
total_listings <- rbind(total_listing_1, total_listing_2)
total_listings_commercial <- total_listings %>%
  filter(is_commercial == "Commercial Listing")
total_listings_non_commercial <- total_listings %>%
  filter(is_commercial == "Household Listing")


library(htmlwidgets)
library(htmltools)

# The six review sub-score columns.
review_columns <- c(
  "review_scores_accuracy",
  "review_scores_cleanliness",
  "review_scores_checkin",
  "review_scores_communication",
  "review_scores_location",
  "review_scores_value"
)

# Per-listing average of the six sub-scores (summed in the order above),
# with missing averages set to 0 and the result multiplied by 10.
total_listings_commercial$avg_review_value <-
  Reduce(`+`, total_listings_commercial[review_columns]) / 6
total_listings_commercial$avg_review_value <- replace(
  total_listings_commercial$avg_review_value,
  is.na(total_listings_commercial$avg_review_value),
  0
) * 10

# Strip "$", "|" and "," from the price text and convert it to a number.
total_listings_commercial$price <- as.numeric(
  gsub("[$|,]", "", total_listings_commercial$price)
)

# Mean price per neighbourhood for commercial listings.
neighbourhood_costs <- setNames(
  data.frame(with(
    total_listings_commercial,
    aggregate(price, by = list(neighbourhood_cleansed), FUN = mean)
  )),
  c("neighbourhood", "avg_cost")
)

# Mean of the per-listing review average per neighbourhood.
neighbourhood_reviews <- setNames(
  data.frame(with(
    total_listings_commercial,
    aggregate(avg_review_value, by = list(neighbourhood_cleansed), FUN = mean)
  )),
  c("neighbourhood", "avg_review")
)

# Mean overall rating per neighbourhood.
neighbourhood_scores <- setNames(
  data.frame(with(
    total_listings_commercial,
    aggregate(review_scores_rating, by = list(neighbourhood_cleansed), FUN = mean)
  )),
  c("neighbourhood", "avg_score")
)

# One row per neighbourhood: avg_score, avg_review, avg_cost (in that order).
neighbourhood_merged <- Reduce(
  function(left, right) merge(left, right, by = "neighbourhood"),
  list(neighbourhood_scores, neighbourhood_reviews, neighbourhood_costs)
)

# Neighbourhood polygons, and a viridis palette with no fixed domain
# (domain = NULL), so it rescales to whatever values it is given.
airbnb_neighbourhoods <- geojsonio::geojson_read(
  "./all Listing Data/neighbourhoods.geojson",
  what = "sp"
)
pal <- colorNumeric(palette = "viridis", domain = NULL)

# Attach the per-neighbourhood averages to the polygons.
airbnb_neighbourhoods <- sp::merge(
  airbnb_neighbourhoods, neighbourhood_merged,
  by = "neighbourhood"
)

 # Linked-image banners for the three commercial maps. Each is a <div> holding
 # an <a> to CRAN that wraps an <img> whose alt text names the map.
 # NOTE(review): the image src looks like a placeholder path - confirm.
 rrsc <- tags$div(HTML(
   '<a href="https://cran.r-project.org/"> <img border="0" alt="Average score for Commercial Listings" src="/PathToImage/ImageR.jpeg" width="300" height="100"> </a>'
 ))

rrrc <- tags$div(HTML(
  '<a href="https://cran.r-project.org/"> <img border="0" alt="Average review for Commercial Listings" src="/PathToImage/ImageR.jpeg" width="300" height="100"> </a>'
))

rrcc <- tags$div(HTML(
  '<a href="https://cran.r-project.org/"> <img border="0" alt="Average cost for Commercial Listings" src="/PathToImage/ImageR.jpeg" width="300" height="100"> </a>'
))

# Choropleth of the average rating per neighbourhood (commercial listings).
p_commercial_score <- airbnb_neighbourhoods %>%
  leaflet() %>%
  addTiles() %>%
  setView(lng = -73.7856491, lat = 40.7022541, zoom = 9) %>%
  addPolygons(
    stroke = FALSE,
    smoothFactor = 0.3,
    fillOpacity = 1,
    fillColor = ~pal(avg_score)
  ) %>%
  leaflet::addLegend(
    pal = pal,
    values = ~(avg_score),
    opacity = 1.0,
    title = "score"
  )

# Choropleth of the average review sub-score per neighbourhood.
p_commercial_review <- airbnb_neighbourhoods %>%
  leaflet() %>%
  addTiles() %>%
  setView(lng = -73.7856491, lat = 40.7022541, zoom = 9) %>%
  addPolygons(
    stroke = FALSE,
    smoothFactor = 0.3,
    fillOpacity = 1,
    fillColor = ~pal(avg_review)
  ) %>%
  leaflet::addLegend(
    pal = pal,
    values = ~(avg_review),
    opacity = 1.0,
    title = "review"
  )

# Choropleth of the average price per neighbourhood.
p_commercial_cost <- airbnb_neighbourhoods %>%
  leaflet() %>%
  addTiles() %>%
  setView(lng = -73.7856491, lat = 40.7022541, zoom = 9) %>%
  addPolygons(
    stroke = FALSE,
    smoothFactor = 0.3,
    fillOpacity = 1,
    fillColor = ~pal(avg_cost)
  ) %>%
  leaflet::addLegend(
    pal = pal,
    values = ~(avg_cost),
    opacity = 1.0,
    title = "cost"
  )


# The six review sub-score columns.
review_columns <- c(
  "review_scores_accuracy",
  "review_scores_cleanliness",
  "review_scores_checkin",
  "review_scores_communication",
  "review_scores_location",
  "review_scores_value"
)

# Per-listing average of the six sub-scores (summed in the order above),
# with missing averages set to 0 and the result multiplied by 10.
total_listings_non_commercial$avg_review_value <-
  Reduce(`+`, total_listings_non_commercial[review_columns]) / 6
total_listings_non_commercial$avg_review_value <- replace(
  total_listings_non_commercial$avg_review_value,
  is.na(total_listings_non_commercial$avg_review_value),
  0
) * 10

# Strip "$", "|" and "," from the price text and convert it to a number.
total_listings_non_commercial$price <- as.numeric(
  gsub("[$|,]", "", total_listings_non_commercial$price)
)

# Mean price per neighbourhood for household listings.
neighbourhood_costs <- setNames(
  data.frame(with(
    total_listings_non_commercial,
    aggregate(price, by = list(neighbourhood_cleansed), FUN = mean)
  )),
  c("neighbourhood", "avg_cost")
)

# Mean of the per-listing review average per neighbourhood.
neighbourhood_reviews <- setNames(
  data.frame(with(
    total_listings_non_commercial,
    aggregate(avg_review_value, by = list(neighbourhood_cleansed), FUN = mean)
  )),
  c("neighbourhood", "avg_review")
)

# Mean overall rating per neighbourhood.
neighbourhood_scores <- setNames(
  data.frame(with(
    total_listings_non_commercial,
    aggregate(review_scores_rating, by = list(neighbourhood_cleansed), FUN = mean)
  )),
  c("neighbourhood", "avg_score")
)

# One row per neighbourhood: avg_score, avg_review, avg_cost (in that order).
neighbourhood_merged <- Reduce(
  function(left, right) merge(left, right, by = "neighbourhood"),
  list(neighbourhood_scores, neighbourhood_reviews, neighbourhood_costs)
)

# Re-read the neighbourhood polygons so the household averages are merged
# onto a fresh copy, and rebuild the viridis palette with no fixed domain.
airbnb_neighbourhoods <- geojsonio::geojson_read(
  "./all Listing Data/neighbourhoods.geojson",
  what = "sp"
)
pal <- colorNumeric(palette = "viridis", domain = NULL)

# Attach the per-neighbourhood averages to the polygons.
airbnb_neighbourhoods <- sp::merge(
  airbnb_neighbourhoods, neighbourhood_merged,
  by = "neighbourhood"
)

 # Linked-image banners for the three household maps. Each is a <div> holding
 # an <a> to CRAN that wraps an <img> whose alt text names the map.
 # NOTE(review): the image src looks like a placeholder path - confirm.
 rrsnc <- tags$div(HTML(
   '<a href="https://cran.r-project.org/"> <img border="0" alt="Average score for Non-Commercial Listings" src="/PathToImage/ImageR.jpeg" width="300" height="100"> </a>'
 ))

rrrnc <- tags$div(HTML(
  '<a href="https://cran.r-project.org/"> <img border="0" alt="Average review for Non-Commercial Listings" src="/PathToImage/ImageR.jpeg" width="300" height="100"> </a>'
))

rrcnc <- tags$div(HTML(
  '<a href="https://cran.r-project.org/"> <img border="0" alt="Average cost for Non-Commercial Listings" src="/PathToImage/ImageR.jpeg" width="300" height="100"> </a>'
))

# Choropleth of the average rating per neighbourhood (household listings).
p_non_commercial_score <- airbnb_neighbourhoods %>%
  leaflet() %>%
  addTiles() %>%
  setView(lng = -73.7856491, lat = 40.7022541, zoom = 9) %>%
  addPolygons(
    stroke = FALSE,
    smoothFactor = 0.3,
    fillOpacity = 1,
    fillColor = ~pal(avg_score)
  ) %>%
  leaflet::addLegend(
    pal = pal,
    values = ~(avg_score),
    opacity = 1.0,
    title = "score"
  )

# Choropleth of the average review sub-score per neighbourhood.
p_non_commercial_review <- airbnb_neighbourhoods %>%
  leaflet() %>%
  addTiles() %>%
  setView(lng = -73.7856491, lat = 40.7022541, zoom = 9) %>%
  addPolygons(
    stroke = FALSE,
    smoothFactor = 0.3,
    fillOpacity = 1,
    fillColor = ~pal(avg_review)
  ) %>%
  leaflet::addLegend(
    pal = pal,
    values = ~(avg_review),
    opacity = 1.0,
    title = "review"
  )

# Choropleth of the average price per neighbourhood.
p_non_commercial_cost <- airbnb_neighbourhoods %>%
  leaflet() %>%
  addTiles() %>%
  setView(lng = -73.7856491, lat = 40.7022541, zoom = 9) %>%
  addPolygons(
    stroke = FALSE,
    smoothFactor = 0.3,
    fillOpacity = 1,
    fillColor = ~pal(avg_cost)
  ) %>%
  leaflet::addLegend(
    pal = pal,
    values = ~(avg_cost),
    opacity = 1.0,
    title = "cost"
  )
# Side-by-side comparison pages: commercial map on the left, household map on
# the right, each wrapped in a half-width floated <div>.

# Average rating.
browsable(tagList(lapply(
  list(p_commercial_score, p_non_commercial_score),
  function(map_widget) {
    tags$div(style = 'width:50%;display:block;float:left;', map_widget)
  }
)))

# Average review sub-score.
browsable(tagList(lapply(
  list(p_commercial_review, p_non_commercial_review),
  function(map_widget) {
    tags$div(style = 'width:50%;display:block;float:left;', map_widget)
  }
)))

# Average price.
browsable(tagList(lapply(
  list(p_commercial_cost, p_non_commercial_cost),
  function(map_widget) {
    tags$div(style = 'width:50%;display:block;float:left;', map_widget)
  }
)))
# Count listings per listing type and borough.
listings_gp <- group_by(total_listings, is_commercial, neighbourhood_group_cleansed)
listings_sm <- summarise(listings_gp, count = n())
colnames(listings_sm) <- c("Listing_Type", "Borough", "Count")

# Total listings per listing type, looked up by label rather than by row
# position so a missing or reordered type cannot swap the two denominators.
borough_type_totals <- vapply(
  split(listings_sm$Count, listings_sm$Listing_Type),
  sum,
  numeric(1)
)
sum_borough_commer <- unname(borough_type_totals["Commercial Listing"])
sum_borough_noncommer <- unname(borough_type_totals["Household Listing"])

# Share of each borough within its listing type, computed for all rows at
# once. This replaces a row-by-row `1:nrow()` loop that failed on an empty
# table and on rows whose listing type is NA; such rows now get NA.
listings_sm$Percentage <- listings_sm$Count * 100 /
  unname(borough_type_totals[as.character(listings_sm$Listing_Type)])

ggplot(data = listings_sm, aes(x = Listing_Type, y = Percentage, fill = Borough)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  xlab("Listing Type") +
  ylab("Percentage") +
  ggtitle("Distribution of Listings") +
  theme_minimal() +
  scale_fill_brewer(palette = "Paired")

# Restrict to apartments and count them per listing type and room type.
apartments <- total_listings %>% filter(property_type == "Apartment")

apt_gp <- group_by(apartments, is_commercial, room_type)
apt_ps <- summarise(apt_gp, count = n())
# The room type is stored under the column name "Property_Type".
colnames(apt_ps) <- c("Listing_Type", "Property_Type", "Count")

# Total apartments per listing type, looked up by label rather than by row
# position so a missing or reordered type cannot swap the two denominators.
apt_type_totals <- vapply(
  split(apt_ps$Count, apt_ps$Listing_Type),
  sum,
  numeric(1)
)
sum_commer <- unname(apt_type_totals["Commercial Listing"])
sum_noncommer <- unname(apt_type_totals["Household Listing"])

# Share of each room type within its listing type, computed for all rows at
# once. This replaces a row-by-row `1:nrow()` loop that failed on an empty
# table and on rows whose listing type is NA; such rows now get NA.
apt_ps$Percentage <- apt_ps$Count * 100 /
  unname(apt_type_totals[as.character(apt_ps$Listing_Type)])

ggplot(data = apt_ps, aes(x = Listing_Type, y = Percentage, fill = Property_Type)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  xlab("Listing_Type") +
  ylab("Percentage") +
  ggtitle("Distribution of Listings") +
  theme_minimal() +
  scale_fill_brewer(palette = "Paired")

# Share of each listing status among the listings of the first snapshot
# (total_listing_1).
tgp <- group_by(total_listing_1, is_commercial)
tsm <- data.frame(summarise(tgp, count = n()))
colnames(tsm) <- c("Listing Status", "count")

tsm$Percentage <- (tsm$count / sum(tsm$count)) * 100

# Horizontal bar chart ordered by percentage. Columns are referenced by name
# inside aes() instead of through `tsm$...`, which ggplot2 discourages because
# it bypasses the plot's data. The former scale_fill_brewer() call was dropped:
# the bars use a fixed fill colour, so no fill scale is ever applied.
ggplot(data = tsm, aes(x = reorder(`Listing Status`, Percentage), y = Percentage)) +
  geom_bar(stat = "identity", width = 0.3, fill = 'steelblue') +
  coord_flip() +
  xlab("Listing Status") +
  ylab("Percentage") +
  ggtitle("Distribution of Listings") +
  theme_minimal()